In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import sqlite3 as sq3
import plotly.offline as pyo
In [2]:
#Q1
df= pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
df.head(4)
Out[2]:
Unnamed: 0 rawid roi volume min max mean std type level id icv tbv
0 1 kirby127a_3_1_ax.img Telencephalon_L 531111 0.0 374.0 128.3013 51.8593 1 1 127 1378295 1268519
1 2 kirby127a_3_1_ax.img Telencephalon_R 543404 0.0 300.0 135.0683 53.6471 1 1 127 1378295 1268519
2 3 kirby127a_3_1_ax.img Diencephalon_L 9683 15.0 295.0 193.5488 32.2733 1 1 127 1378295 1268519
3 4 kirby127a_3_1_ax.img Diencephalon_R 9678 10.0 335.0 193.7051 32.7869 1 1 127 1378295 1268519
In [3]:
df = df.loc[(df.type == 1) & (df.rawid== "kirby127a_3_1_ax.img")]
df = df.assign(comp = df.volume / np.sum(df.volume))
df.head()
Out[3]:
Unnamed: 0 rawid roi volume min max mean std type level id icv tbv comp
0 1 kirby127a_3_1_ax.img Telencephalon_L 531111 0.0 374.0 128.3013 51.8593 1 1 127 1378295 1268519 0.077066
1 2 kirby127a_3_1_ax.img Telencephalon_R 543404 0.0 300.0 135.0683 53.6471 1 1 127 1378295 1268519 0.078850
2 3 kirby127a_3_1_ax.img Diencephalon_L 9683 15.0 295.0 193.5488 32.2733 1 1 127 1378295 1268519 0.001405
3 4 kirby127a_3_1_ax.img Diencephalon_R 9678 10.0 335.0 193.7051 32.7869 1 1 127 1378295 1268519 0.001404
4 5 kirby127a_3_1_ax.img Mesencephalon 10268 55.0 307.0 230.8583 29.2249 1 1 127 1378295 1268519 0.001490
In [4]:
level1 = df.loc[(df.level == 1)]
group1= level1.groupby('roi')
group1.head(4)
Out[4]:
Unnamed: 0 rawid roi volume min max mean std type level id icv tbv comp
0 1 kirby127a_3_1_ax.img Telencephalon_L 531111 0.0 374.0 128.3013 51.8593 1 1 127 1378295 1268519 0.077066
1 2 kirby127a_3_1_ax.img Telencephalon_R 543404 0.0 300.0 135.0683 53.6471 1 1 127 1378295 1268519 0.078850
2 3 kirby127a_3_1_ax.img Diencephalon_L 9683 15.0 295.0 193.5488 32.2733 1 1 127 1378295 1268519 0.001405
3 4 kirby127a_3_1_ax.img Diencephalon_R 9678 10.0 335.0 193.7051 32.7869 1 1 127 1378295 1268519 0.001404
4 5 kirby127a_3_1_ax.img Mesencephalon 10268 55.0 307.0 230.8583 29.2249 1 1 127 1378295 1268519 0.001490
5 6 kirby127a_3_1_ax.img Metencephalon 159402 2.0 299.0 138.5200 52.2241 1 1 127 1378295 1268519 0.023130
6 7 kirby127a_3_1_ax.img Myelencephalon 4973 12.0 286.0 199.8497 36.6501 1 1 127 1378295 1268519 0.000722
7 8 kirby127a_3_1_ax.img CSF 109776 0.0 258.0 33.0193 26.3262 1 1 127 1378295 1268519 0.015929
In [5]:
level2 = df.loc[(df.level == 2)]
group2= level2.groupby('roi')
group2.head(4)
Out[5]:
Unnamed: 0 rawid roi volume min max mean std type level id icv tbv comp
8 9 kirby127a_3_1_ax.img CerebralCortex_L 276965 0.0 249.0 88.5014 27.7510 1 2 127 1378295 1268519 0.040188
9 10 kirby127a_3_1_ax.img CerebralCortex_R 282858 0.0 295.0 93.7283 28.1726 1 2 127 1378295 1268519 0.041044
10 11 kirby127a_3_1_ax.img CerebralNucli_L 12380 14.0 273.0 165.5364 28.7978 1 2 127 1378295 1268519 0.001796
11 12 kirby127a_3_1_ax.img CerebralNucli_R 13073 16.0 273.0 168.7348 30.4016 1 2 127 1378295 1268519 0.001897
12 13 kirby127a_3_1_ax.img Thalamus_L 6342 38.0 268.0 192.1482 24.4523 1 2 127 1378295 1268519 0.000920
13 14 kirby127a_3_1_ax.img Thalamus_R 6372 34.0 265.0 194.3232 27.0116 1 2 127 1378295 1268519 0.000925
14 15 kirby127a_3_1_ax.img BasalForebrain_L 3341 15.0 295.0 196.2076 43.2780 1 2 127 1378295 1268519 0.000485
15 16 kirby127a_3_1_ax.img BasalForebrain_R 3306 10.0 335.0 192.5140 41.6949 1 2 127 1378295 1268519 0.000480
16 17 kirby127a_3_1_ax.img Mesencephalon_L 4943 57.0 296.0 228.3875 27.4134 1 2 127 1378295 1268519 0.000717
17 18 kirby127a_3_1_ax.img Mesencephalon_R 5325 55.0 307.0 233.1521 30.6334 1 2 127 1378295 1268519 0.000773
18 19 kirby127a_3_1_ax.img Metencephalon_R 79910 2.0 299.0 144.9297 53.1512 1 2 127 1378295 1268519 0.011595
19 20 kirby127a_3_1_ax.img Metencephalon_L 79493 3.0 275.0 132.0767 50.4612 1 2 127 1378295 1268519 0.011535
20 21 kirby127a_3_1_ax.img Myelencephalon_L 2403 25.0 283.0 196.0509 37.1295 1 2 127 1378295 1268519 0.000349
21 22 kirby127a_3_1_ax.img Myelencephalon_R 2571 12.0 286.0 203.4001 35.8340 1 2 127 1378295 1268519 0.000373
22 23 kirby127a_3_1_ax.img WhiteMatter_L 241767 2.0 374.0 171.9891 34.4828 1 2 127 1378295 1268519 0.035081
23 24 kirby127a_3_1_ax.img WhiteMatter_R 247473 10.0 300.0 180.5410 35.8129 1 2 127 1378295 1268519 0.035909
24 25 kirby127a_3_1_ax.img Ventricle 45791 0.0 258.0 37.3184 34.2745 1 2 127 1378295 1268519 0.006644
25 26 kirby127a_3_1_ax.img Sulcus_L 33971 0.0 137.0 29.4653 17.5085 1 2 127 1378295 1268519 0.004929
26 27 kirby127a_3_1_ax.img Sulcus_R 30015 0.0 161.0 30.4828 18.6214 1 2 127 1378295 1268519 0.004355
In [6]:
level3 = df.loc[(df.level == 3)]
group3= level3.groupby('roi').mean('volume')
group3.head(4)
Out[6]:
Unnamed: 0 volume min max mean std type level id icv tbv comp
roi
AnteriorWM_L 54.0 87012.0 9.0 374.0 172.2390 33.3689 1.0 3.0 127.0 1378295.0 1268519.0 0.012626
AnteriorWM_R 55.0 91755.0 11.0 278.0 172.5715 31.9999 1.0 3.0 127.0 1378295.0 1268519.0 0.013314
BasalForebrain_L 44.0 3341.0 15.0 295.0 196.2076 43.2780 1.0 3.0 127.0 1378295.0 1268519.0 0.000485
BasalForebrain_R 45.0 3306.0 10.0 335.0 192.5140 41.6949 1.0 3.0 127.0 1378295.0 1268519.0 0.000480
In [7]:
level4 = df.loc[(df.level == 4)]
group4= level4.groupby('roi').mean('volume')
group4.head(4)
Out[7]:
Unnamed: 0 volume min max mean std type level id icv tbv comp
roi
AG_L 100.0 7430.0 4.0 159.0 75.3894 21.2562 1.0 4.0 127.0 1378295.0 1268519.0 0.001078
AG_R 101.0 13580.0 1.0 173.0 83.2116 25.0679 1.0 4.0 127.0 1378295.0 1268519.0 0.001970
ALIC_L 162.0 2344.0 142.0 278.0 222.3963 21.9805 1.0 4.0 127.0 1378295.0 1268519.0 0.000340
ALIC_R 163.0 2528.0 131.0 263.0 222.2013 18.3836 1.0 4.0 127.0 1378295.0 1268519.0 0.000367
In [8]:
label = ["icv", "level 1", "level 2", "level 3", "level 4", "Diencephalon_L", "Diencephalon_R", "CerebralCortex_L", "CerebralCortex_R", "AnteriorWM_L", "AnteriorWM_R", "AG_L", "AG_R"]
source = [0, 0, 0, 0,
          1, 1,  
          2, 2,
          3, 3,
          4, 4]
target = [ 1, 2, 3, 4,
          5, 6, 
          7, 8,
          9, 10,
         11, 12]
value = [1378295, 1378295, 1378295, 1378295,
        531111,543404,
        276965, 282858, 
        87012,91755, 
        7430,13580]
In [9]:
link= dict(source = source, target = target, value= value)
node = dict(label = label, pad=15, thickness=5)
df = go.Sankey(link = link, node=node)
fig = go.Figure(df)
fig.show()
In [10]:
pyo.plot(fig, filename='figure1plot.html')
Out[10]:
'figure1plot.html'
In [11]:
import sqlite3 as sq3
import pandas as pd
In [12]:
#Q3

con = sq3.connect("C:\sqlite3/opioid.db")
population_info = pd.read_sql_query("SELECT * from population", con)
con.close
population_info.head(4)
Out[12]:
? BUYER_COUNTY BUYER_STATE countyfips STATE COUNTY county_name NAME variable year population
0 1 AUTAUGA AL 01001 1 1 Autauga Autauga County, Alabama B01003_001 2006 51328
1 2 BALDWIN AL 01003 1 3 Baldwin Baldwin County, Alabama B01003_001 2006 168121
2 3 BARBOUR AL 01005 1 5 Barbour Barbour County, Alabama B01003_001 2006 27861
3 4 BIBB AL 01007 1 7 Bibb Bibb County, Alabama B01003_001 2006 22099
In [13]:
con = sq3.connect("C:\sqlite3/opioid.db")
annual_info = pd.read_sql_query("SELECT * from annual", con)
con.close
annual_info.head
Out[13]:
<bound method NDFrame.head of            ? BUYER_COUNTY BUYER_STATE  year count DOSAGE_UNIT countyfips
0          1    ABBEVILLE          SC  2006   877      363620      45001
1          2    ABBEVILLE          SC  2007   908      402940      45001
2          3    ABBEVILLE          SC  2008   871      424590      45001
3          4    ABBEVILLE          SC  2009   930      467230      45001
4          5    ABBEVILLE          SC  2010  1197      539280      45001
...      ...          ...         ...   ...   ...         ...        ...
27753  27754           NA          NV  2007   447      200600         NA
27754  27755           NA          NV  2008     5        2200         NA
27755  27756           NA          OH  2006    23        5100         NA
27756  27757           NA          PR  2006    10       17800         NA
27757  27758           NA          PR  2007     2        1300         NA

[27758 rows x 7 columns]>
In [14]:
annual= annual_info.loc[(annual_info.countyfips == 'NA')]
annual.head()
Out[14]:
? BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
187 188 ADJUNTAS PR 2006 147 102800 NA
188 189 ADJUNTAS PR 2007 153 104800 NA
189 190 ADJUNTAS PR 2008 153 45400 NA
190 191 ADJUNTAS PR 2009 184 54200 NA
191 192 ADJUNTAS PR 2010 190 56200 NA
In [15]:
annual= annual.loc[(annual.BUYER_STATE != "PR") & (annual.countyfips == 'NA')]
annual.head()
Out[15]:
? BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
10071 10072 GUAM GU 2006 319 265348 NA
10072 10073 GUAM GU 2007 330 275600 NA
10073 10074 GUAM GU 2008 313 286900 NA
10074 10075 GUAM GU 2009 390 355300 NA
10075 10076 GUAM GU 2010 510 413800 NA
In [16]:
annual = annual.loc[(annual.BUYER_STATE == "AR") & (annual.BUYER_COUNTY == "MONTGOMERY")].replace({"NA":"5097"})
annual.head()
Out[16]:
? BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
17429 17430 MONTGOMERY AR 2006 469 175390 5097
17430 17431 MONTGOMERY AR 2007 597 241270 5097
17431 17432 MONTGOMERY AR 2008 561 251760 5097
17432 17433 MONTGOMERY AR 2009 554 244160 5097
17433 17434 MONTGOMERY AR 2010 449 247990 5097
In [17]:
annual_info.loc[(annual_info['BUYER_STATE'] == 'AR') & (annual_info['BUYER_COUNTY'] == 'MONTGOMERY'), 'countyfips'] = '05097'
annual_info.head()
Out[17]:
? BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
0 1 ABBEVILLE SC 2006 877 363620 45001
1 2 ABBEVILLE SC 2007 908 402940 45001
2 3 ABBEVILLE SC 2008 871 424590 45001
3 4 ABBEVILLE SC 2009 930 467230 45001
4 5 ABBEVILLE SC 2010 1197 539280 45001
In [18]:
annual_info= annual_info.loc[annual_info['BUYER_COUNTY'] != "NA"]
annual_info.head()
Out[18]:
? BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
0 1 ABBEVILLE SC 2006 877 363620 45001
1 2 ABBEVILLE SC 2007 908 402940 45001
2 3 ABBEVILLE SC 2008 871 424590 45001
3 4 ABBEVILLE SC 2009 930 467230 45001
4 5 ABBEVILLE SC 2010 1197 539280 45001
In [19]:
con = sq3.connect("C:\sqlite3/opioid.db")
land_info = pd.read_sql_query("SELECT * from land", con)
con.close
land_info.head()
Out[19]:
? Areaname STCOU LND010190F LND010190D LND010190N1 LND010190N2 LND010200F LND010200D LND010200N1 ... LND110210N1 LND110210N2 LND210190F LND210190D LND210190N1 LND210190N2 LND210200F LND210200D LND210200N1 LND210200N2
0 1 UNITED STATES 00000 0 3787425.08 0000 0000 0 3794083.06 0000 ... 0000 0000 0 251083.35 0000 0000 0 256644.62 0000 0000
1 2 ALABAMA 01000 0 52422.94 0000 0000 0 52419.02 0000 ... 0000 0000 0 1672.71 0000 0000 0 1675.01 0000 0000
2 3 Autauga, AL 01001 0 604.49 0000 0000 0 604.45 0000 ... 0000 0000 0 8.48 0000 0000 0 8.48 0000 0000
3 4 Baldwin, AL 01003 0 2027.08 0000 0000 0 2026.93 0000 ... 0000 0000 0 430.55 0000 0000 0 430.58 0000 0000
4 5 Barbour, AL 01005 0 904.59 0000 0000 0 904.52 0000 ... 0000 0000 0 19.59 0000 0000 0 19.61 0000 0000

5 rows × 35 columns

In [20]:
land = land_info.loc[:,['Areaname', 'STCOU', 'LND110210D']]
land.head(10)
Out[20]:
Areaname STCOU LND110210D
0 UNITED STATES 00000 3531905.43
1 ALABAMA 01000 50645.33
2 Autauga, AL 01001 594.44
3 Baldwin, AL 01003 1589.78
4 Barbour, AL 01005 884.88
5 Bibb, AL 01007 622.58
6 Blount, AL 01009 644.78
7 Bullock, AL 01011 622.81
8 Butler, AL 01013 776.83
9 Calhoun, AL 01015 605.87
In [21]:
land= land.rename(columns = {'STCOU':'countyfips'})
land.head(10)
Out[21]:
Areaname countyfips LND110210D
0 UNITED STATES 00000 3531905.43
1 ALABAMA 01000 50645.33
2 Autauga, AL 01001 594.44
3 Baldwin, AL 01003 1589.78
4 Barbour, AL 01005 884.88
5 Bibb, AL 01007 622.58
6 Blount, AL 01009 644.78
7 Bullock, AL 01011 622.81
8 Butler, AL 01013 776.83
9 Calhoun, AL 01015 605.87
In [22]:
population_info.merge(land, on='countyfips', how='left')
Out[22]:
? BUYER_COUNTY BUYER_STATE countyfips STATE COUNTY county_name NAME variable year population Areaname LND110210D
0 1 AUTAUGA AL 01001 1 1 Autauga Autauga County, Alabama B01003_001 2006 51328 Autauga, AL 594.44
1 2 BALDWIN AL 01003 1 3 Baldwin Baldwin County, Alabama B01003_001 2006 168121 Baldwin, AL 1589.78
2 3 BARBOUR AL 01005 1 5 Barbour Barbour County, Alabama B01003_001 2006 27861 Barbour, AL 884.88
3 4 BIBB AL 01007 1 7 Bibb Bibb County, Alabama B01003_001 2006 22099 Bibb, AL 622.58
4 5 BLOUNT AL 01009 1 9 Blount Blount County, Alabama B01003_001 2006 55485 Blount, AL 644.78
... ... ... ... ... ... ... ... ... ... ... ... ... ...
28260 28261 WASHAKIE WY 56043 56 43 Washakie Washakie County, Wyoming B01003_001 2014 8444 Washakie, WY 2238.55
28261 28262 WESTON WY 56045 56 45 Weston Weston County, Wyoming B01003_001 2014 7135 Weston, WY 2398.09
28262 28263 SKAGWAY AK 02230 2 230 Skagway Skagway Municipality, Alaska B01003_001 2014 996 Skagway, AK 452.33
28263 28264 HOONAH ANGOON AK 02105 2 105 Hoonah Angoon Hoonah-Angoon Census Area, Alaska B01003_001 2014 2126 Hoonah-Angoon, AK 7524.92
28264 28265 PETERSBURG AK 02195 2 195 Petersburg Petersburg Borough, Alaska B01003_001 2014 3212 Petersburg, AK 3281.98

28265 rows × 13 columns

In [23]:
land_info.shape
Out[23]:
(3198, 35)
In [24]:
land.shape
Out[24]:
(3198, 3)
In [25]:
population_info.shape
Out[25]:
(28265, 11)
In [26]:
annual_info.shape
Out[26]:
(27741, 7)
In [27]:
#Q4
summary = annual_info[['countyfips', 'year', 'DOSAGE_UNIT']]
summary
Out[27]:
countyfips year DOSAGE_UNIT
0 45001 2006 363620
1 45001 2007 402940
2 45001 2008 424590
3 45001 2009 467230
4 45001 2010 539280
... ... ... ...
27736 48507 2010 200100
27737 48507 2011 244800
27738 48507 2012 263700
27739 48507 2013 186700
27740 48507 2014 148930

27741 rows × 3 columns

In [28]:
summary_avg = summary.groupby(['year', 'countyfips'])['DOSAGE_UNIT'].mean().reset_index()
summary_avg.head()
Out[28]:
year countyfips DOSAGE_UNIT
0 2006 01001 2277140.0
1 2006 01003 6353798.0
2 2006 01005 827060.0
3 2006 01007 754210.0
4 2006 01009 1290295.0
In [29]:
fig2 = px.scatter(summary_avg, x = "year", y = "DOSAGE_UNIT", color = "countyfips", title='Average count of opioid pills per year in millions')
figure2 = go.Figure(fig2)
figure2.show()
In [30]:
pyo.plot(figure2, filename='figure2plot.html')
Out[30]:
'figure2plot.html'
In [ ]:
##Q2 & Q4 link for public html with graphs 
https://azeher1.github.io/